% File src/library/stats/man/aggregate.Rd
% Part of the R package, http://www.R-project.org
% Copyright 1995-2008 R Core Development Team
% Distributed under GPL 2 or later

\name{aggregate}
\alias{aggregate}
\alias{aggregate.default}
\alias{aggregate.data.frame}
\alias{aggregate.ts}
\title{Compute Summary Statistics of Data Subsets}
\usage{
aggregate(x, \dots)

\method{aggregate}{default}(x, \dots)

\method{aggregate}{data.frame}(x, by, FUN, \dots)

\method{aggregate}{ts}(x, nfrequency = 1, FUN = sum, ndeltat = 1,
          ts.eps = getOption("ts.eps"), \dots)
}
\description{
  Splits the data into subsets, computes summary statistics for each,
  and returns the result in a convenient form.
}
\arguments{
  \item{x}{an R object.}
  \item{by}{a list of grouping elements, each as long as the variables
    in \code{x}.}
  \item{FUN}{a scalar function to compute the summary statistics which
    can be applied to all data subsets.}
  \item{nfrequency}{new number of observations per unit of time; must
    be a divisor of the frequency of \code{x}.}
  \item{ndeltat}{new fraction of the sampling period between
    successive observations; must be a divisor of the sampling
    interval of \code{x}.}
  \item{ts.eps}{tolerance used to decide if \code{nfrequency} is a
    sub-multiple of the original frequency.}
  \item{\dots}{further arguments passed to or used by methods.}
}
\details{
  \code{aggregate} is a generic function with methods for data frames
  and time series.
  
  The default method \code{aggregate.default} uses the time series
  method if \code{x} is a time series, and otherwise coerces \code{x}
  to a data frame and calls the data frame method.

  \code{aggregate.data.frame} is the data frame method.  If \code{x}
  is not a data frame, it is coerced to one, which must have a non-zero
  number of rows.  Then, each of the
  variables (columns) in \code{x} is split into subsets of cases
  (rows) of identical combinations of the components of \code{by}, and
  \code{FUN} is applied to each such subset with further arguments in
  \code{\dots} passed to it.
  (I.e., \code{tapply(VAR, by, FUN, \dots, simplify = FALSE)} is done
  for each variable \code{VAR} in \code{x}, conveniently wrapped into
  one call to \code{lapply()}.)
  Empty subsets are removed, and the result is reformatted into a data
  frame containing the variables in \code{by} and \code{x}.  The ones
  arising from \code{by} contain the unique combinations of grouping
  values used for determining the subsets, and the ones arising from
  \code{x} the corresponding summary statistics for the subset of the
  respective variables in \code{x}.  Rows with missing values in any of
  the \code{by} variables will be omitted from the result.
  
  \code{aggregate.ts} is the time series method.  If \code{x} is not a
  time series, it is coerced to one.  Then, the variables in \code{x}
  are split into appropriate blocks of length
  \code{frequency(x) / nfrequency}, and \code{FUN} is applied to each
  such block, with further (named) arguments in \code{\dots} passed to
  it.  The result returned is a time series with frequency
  \code{nfrequency} holding the aggregated values.  Note that this makes
  most sense for a quarterly or yearly result when the original
  series covers a whole number of quarters or years: in particular
  aggregating a monthly series to quarters starting in February does not
  give a conventional quarterly series.
  
  \code{FUN} is passed to \code{\link{match.fun}}, and hence it can be a
  function or a symbol or character string naming a function.
}
\value{
  For the time series method, a time series of class \code{"ts"} or
  class \code{c("mts", "ts")}.

  For the data frame method, a data frame with columns
  corresponding to the grouping variables in \code{by} followed by
  aggregated columns from \code{x}.  If \code{by} has names, the
  non-empty names are used to label the columns in the result, with
  unnamed grouping variables being named \code{Group.\var{i}} for
  \code{by[[\var{i}]]}.

  \strong{Note:} prior to \R 2.6.0
  the grouping variables were reported as factors with levels in
  alphabetical order in the current locale.  Now the variable in the
  result is found by subsetting the original variable.
}
\author{Kurt Hornik}
\seealso{
  \code{\link{apply}}, \code{\link{lapply}}, \code{\link{tapply}}.
}
\references{
  Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
  \emph{The New S Language}.
  Wadsworth & Brooks/Cole.
}
\examples{
## Compute the averages for the variables in 'state.x77', grouped
## according to the region (Northeast, South, North Central, West) that
## each state belongs to.
aggregate(state.x77, list(Region = state.region), mean)

## Compute the averages according to region and the occurrence of more
## than 130 days of frost.
aggregate(state.x77,
          list(Region = state.region,
               Cold = state.x77[,"Frost"] > 130),
          mean)
## (Note that no state in 'South' is THAT cold.)


## example with character variables and NAs
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
                     v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red","blue",1,2,NA,"big",1,2,"red",1,NA,12)
by2 <- c("wet","dry",99,95,NA,"damp",95,99,"red",99,NA,NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")

# and if you want to treat NAs as a group
fby1 <- factor(by1, exclude = "")
fby2 <- factor(by2, exclude = "")
aggregate(x = testDF, by = list(fby1, fby2), FUN = "mean")


## Compute the average annual approval ratings for American presidents.
aggregate(presidents, nfrequency = 1, FUN = mean)
## Give the summer less weight.
aggregate(presidents, nfrequency = 1,
          FUN = weighted.mean, w = c(1, 1, 0.5, 1))
}
\keyword{category}
\keyword{array}
